library(here)
library(tidyverse)
library(minfi)
library(ggplot2)
library(ENmix)
library(stringr)
library(geneplotter)
library(ChAMP)
library(wateRmelon)
# setting project and data directories
change_here = function(new_path){
new_root = here:::.root_env
new_root$f = function(...){file.path(new_path, ...)}
assignInNamespace(".root_env", new_root, ns = "here")
}
change_here('/FACT_850K/')
setwd('/FACT_850K/')
sample_info = read.csv(here("FACT_IDs_850k.csv"))
# paths to idats
files_idat_raw = paste0(here(), '/EPIC/', sample_info$Sentrix_ID, '_', sample_info$Sentrix_Position)
# combining with pheno file
pheno = cbind(sample_info, files_idat_raw)
colnames(pheno)[ncol(pheno)] = 'Basename'
Reading idats
RGset = read.metharray.exp(targets=pheno, extended = TRUE)
Produces a PDF QC report for Illumina Infinium Human Methylation 450k arrays, which is useful for identifying failed samples.
qcReport(RGset, pdf="fact_850k_ControlProbes.pdf")
The ControlProbesPlot.pdf file is a 19-page report of the internal quality control information.
A number of functions are run sequentially on the RGset object.First outlier values are thresholded using fixMethOutliers. Then qc is performed using getQC and then sample specific sex is estimated using getSex.
# Creation Of A MethylSet Without Normalization (minifi)
Mset = preprocessRaw(RGset)
QC_minfi = minfiQC(Mset, fixOutliers = TRUE)
# Produce QC diagnostic plots
quartz()
plotQC(QC_minfi$qc)
quartz.save('PotentialOutliers.png', type = 'png', dpi = 200)
knitr::include_graphics("PotentialOutliers.png")
#Plot quality control plot, mdsplot, densityPlot, dendrogram.
pheno$Cig.Ever = factor(pheno$Cig.Ever)
champ.QC(beta = getBeta(Mset), pheno=pheno$Cig.Ever, mdsPlot=TRUE, densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE,Feature.sel="None", resultsDir=here::here("QC_preFiltering"))
knitr::include_graphics("QC_preFiltering/raw_densityPlot.png")
knitr::include_graphics("QC_preFiltering/raw_mdsPlot.png")
knitr::include_graphics("QC_preFiltering/raw_SampleCluster.png")
Imputing missing beta values before performing SVD
table(is.na(getBeta(Mset)))
# FALSE TRUE
# 27737797 955
beta = getBeta(Mset)
betaImpute = champ.impute(beta=beta, pd=pheno, k=5, ProbeCutoff=0.2, SampleCutoff=0.1)
champ.SVD(beta = betaImpute$beta, rgSet=RGset, pd=pheno[,c(8,10,25,29,30,31,32,33,38,46)], RGEffect=TRUE, Rplot=FALSE, resultsDir=here::here("QC_preFiltering"))
knitr::include_graphics("QC_preFilteringSVDsummary.png")
Extract information for data quanlity controls: detection P values, number of beads and averaged bisulfite conversion intensity. The function can also identify low quality samples and probes, as well as outlier samples based on total intensity or beta value distribution.
qc = ENmix::QCinfo(RGset, distplot=FALSE)
# 0 samples with percentage of low quality CpG value greater than 0.05 or bisulfite intensity less than 8316.313
# 4806 CpGs with percentage of low quality value greater than 0.05
# Ploting qc_sample.jpg ...Done
# Ploting qc_CpG.jpg ...Done
# Identifying ourlier samples based on beta or total intensity values...
# After excluding low quality samples and CpGs
# 0 samples are outliers based on averaged total intensity value
# 0 samples are outliers in beta value distribution
# 0 outlier samples were added into badsample list
filtered = ChAMP::champ.filter(beta=getBeta(Mset), pd = pheno, beadcount = qc$nbead, detP = qc$detP, arraytype = "EPIC", SampleCutoff=0.05)
# [ Section 1: Check Input Start ]
# You have inputed beta for Analysis.
# pd file provided, checking if it's in accord with Data Matrix...
# pd file check success.
# Parameter filterDetP is TRUE, checking if detP in accord with Data Matrix...
# detP check success.
# Parameter filterBeads is TRUE, checking if beadcount in accord with Data Matrix...
# beadcount check success.
# parameter autoimpute is TRUE. Checking if the conditions are fulfilled...
# !!! ProbeCutoff is 0, which means you have no needs to do imputation. autoimpute has been reset FALSE.
# Checking Finished :filterDetP,filterBeads,filterMultiHit,filterSNPs,filterNoCG,filterXY would be done on beta.
# You also provided :detP,beadcount .
# [ Section 1: Check Input Done ]
# [ Section 2: Filtering Start >>
# Filtering Detect P value Start
# The fraction of failed positions per sample
# You may need to delete samples with high proportion of failed probes:
# Failed CpG Fraction.
# 201005010046_R01C01 0.0002318778
# 201005010046_R02C01 0.0002434140
# 201005010046_R03C01 0.0002480285
# 201005010046_R04C01 0.0002041909
# 201005010046_R05C01 0.0002999414
# 201005010046_R06C01 0.0002537966
# 201005010046_R07C01 0.0002791762
# 201005010046_R08C01 0.0002503357
# 200999660113_R01C01 0.0004879816
# 200999660113_R02C01 0.0004591411
# 200999660113_R03C01 0.0006114190
# 200999660113_R04C01 0.0009240502
# 200999660113_R05C01 0.0005733495
# 200999660113_R06C01 0.0006667928
# 200999660113_R07C01 0.0006437204
# 200999660113_R08C01 0.0006610247
# 200932680177_R01C01 0.0002584110
# 200932680177_R02C01 0.0004141499
# 200932680177_R03C01 0.0002422604
# 200932680177_R04C01 0.0003772340
# 200932680177_R05C01 0.0003357036
# 200932680177_R06C01 0.0003553152
# 200932680177_R07C01 0.0003553152
# 200932680177_R08C01 0.0004141499
# 201004220064_R01C01 0.0003968455
# 201004220064_R02C01 0.0003645442
# 201004220064_R03C01 0.0003195529
# 201004220064_R04C01 0.0003333964
# 201004220064_R05C01 0.0004129962
# 201004220064_R06C01 0.0003403181
# 201004220064_R07C01 0.0003287819
# 201004220064_R08C01 0.0003910774
# Filtering probes with a detection p-value above 0.01.
# Removing 3338 probes.
# If a large number of probes have been removed, ChAMP suggests you to identify potentially bad samples
# Filtering BeadCount Start
# Filtering probes with a beadcount <3 in at least 5% of samples.
# Removing 0 probes
# Filtering NoCG Start
# Only Keep CpGs, removing 2931 probes from the analysis.
# Filtering SNPs Start
# Using general EPIC SNP list for filtering.
# Filtering probes with SNPs as identified in Zhou's Nucleic Acids Research Paper 2016.
# Removing 97356 probes from the analysis.
# Filtering MultiHit Start
# Filtering probes that align to multiple locations as identified in Nordlund et al
# Removing 24 probes from the analysis.
# Filtering XY Start
# Filtering probes located on X,Y chromosome, removing 16806 probes from the analysis.
# Updating PD file
# Fixing Outliers Start
# Replacing all value smaller/equal to 0 with smallest positive value.
# Replacing all value greater/equal to 1 with largest value below 1..
# [ Section 2: Filtering Done ]
# All filterings are Done, now you have 746381 probes and 32 samples.
pheno = filtered$pd
betas = filtered$beta
mvals = filtered$M
filtered_CpG = rownames(betas)
unfiltered_CpG = rownames(getBeta(Mset))
outCpG = unfiltered_CpG[!(unfiltered_CpG %in% filtered_CpG)]
champ.QC(beta = betas, pheno=pheno$Cig.Ever, mdsPlot=TRUE, densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE, Feature.sel="None", resultsDir=here::here("QC_postFiltering"))
save(qc, filtered, RGset, outCpG, pheno, mvals, betas, file = here("FACT_850k_QC.RData"))
knitr::include_graphics("QC_postFiltering/raw_densityPlot.png")
knitr::include_graphics("QC_postFiltering/raw_mdsPlot.png")
knitr::include_graphics("QC_postFiltering/raw_SampleCluster.png")
champ.SVD(beta = betas, rgSet=RGset, pd=pheno[,c(8,10,25,29,30,31,32,33,38,46)], RGEffect=TRUE, Rplot=FALSE, resultsDir=here::here("QC_postFiltering"))
knitr::include_graphics("QC_postFilteringSVDsummary.png")
RGset.noob <- preprocessNoob(RGset)
save(RGset.noob,file="RGsetnoob.RData")
# plot probes density
densityPlot(RGset, main = "density plots before and after preprocessing", pal="#440154FF", ylim=c(0,4.5))
densityPlot(RGset.noob, add = F, pal = "#FDE725FF")
# Add legend
legend("topleft", c("Noob","Raw"), lty=c(1,1), title="Normalization", bty='n', cex=1.3, col=c("#FDE725FF","#440154FF"))
quartz.save('noobDensity.png', type = "png", dpi = 300)
knitr::include_graphics('noobDensity.png')
cell_counts = estimateCellCounts2(RGset, compositeCellType = "Blood", referencePlatform = "IlluminaHumanMethylationEPIC", returnAll = TRUE, meanPlot = FALSE, verbose = TRUE)
write.csv(cell_counts$counts, file = "fact_850k_cellcounts.csv", row.names = F)
save(cell_counts,file = "fact_850k_cell_counts.RData")
# add cell estimates to pheno file
cellprop = cell_counts$counts
cellprop = data.frame(cellprop)
cellprop$Sample_Name = rownames(cellprop)
pheno = merge(pheno, cellprop, by = 'Sample_Name')
ids = colnames(RGset)
pheno = pheno[match(ids, pheno$Sample_Name),]
table(pheno$Sample_Name == colnames(RGset))
# TRUE
# 32
boxplot(cellprop[,c(1:6)]*100, col=1:ncol(cellprop),xlab="Cell type",ylab="Estimated %",main="Cell type distribution")
quartz.save('cellprop', type = "png", dpi = 300)
knitr::include_graphics('cellprop.png')
fun = preprocessFunnorm(RGset)
betas_fun = getBeta(fun)
betas_fun = betas_fun[!rownames(betas_fun) %in% outCpG,]
dim(betas_fun)
# 745578 32
betas_fun = Harman::shiftBetas(betas_fun, shiftBy=1e-4)
mvals_fun <- B2M(betas_fun)
champ.QC(beta = betas_fun, pheno= pheno$Cig.Ever, mdsPlot=TRUE, densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE, Feature.sel="None", resultsDir=here::here("postFunnorm"))
champ.SVD(beta = betas_fun, rgSet=RGset, pd=pheno[,c(8,10,25,29,30,31,32,33,38,46)], RGEffect=TRUE, Rplot=FALSE, resultsDir=here::here("postFunnorm"))
save(betas_fun, mvals_fun, pheno, file = here("fact_850k_funnorm_data.RData"))
knitr::include_graphics("postFunnorm/raw_densityPlot.png")
knitr::include_graphics("postFunnorm/raw_mdsPlot.png")
knitr::include_graphics("postFunnorm/raw_SampleCluster.png")
knitr::include_graphics("postFunnormSVDsummary.png")
# quan = preprocessQuantile(RGset, fixOutliers = TRUE, quantileNormalize=TRUE, stratified=TRUE)
# quan = quan[!rownames(quan) %in% outCpG]
# quan = mapToGenome(quan)
# mvals = assays(quan)$M
# betas = M2B(mvals)
# newBetas = Harman::shiftBetas(betas, shiftBy=1e-4)
# # No beta values were found to be 0 or 1. No shifts made.
# mvals_Quantile = B2M(newBetas)
# betas_Quantile = M2B(mvals_Quantile)
# dim(betas_Quantile)
# # 411400 48
# champ.QC(beta = betas_Quantile, pheno= pheno$expGroup, mdsPlot=TRUE,
# densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE,
# Feature.sel="None", resultsDir=here::here("postNorm"))
# champ.SVD(beta = betas_Quantile, rgSet=RGset, pd=pheno[-c(1,2,3,4,10,19,20)], RGEffect=TRUE,
# Rplot=FALSE, resultsDir=here::here("postNorm"))
# save(betas_Quantile, mvals_Quantile, pheno, file = here("fact_quantNorm_data.RData"))
# knitr::include_graphics("postNorm/raw_densityPlot.png")
# knitr::include_graphics("postNorm/raw_mdsPlot.png")
# knitr::include_graphics("postNorm/raw_SampleCluster.png")
# knitr::include_graphics("postNormSVDsummary.png")
# table(pheno$Sample_Name == colnames(mvals_fun))
# # TRUE
# # 32
# # PCA
# fact.pca = prcomp(mvals_fun, scale = TRUE)
# pairs(fact.pca$rotation[,1:5], pch = 19, cex = 0.5, col = factor(pheno$Cig.Ever))
# quartz.save('pca_batch.png', type = "png", dpi = 300)
# knitr::include_graphics("pca_batch.png")
# batch = factor(pheno$Batch)
# modcombat = model.matrix(~1, data=pheno)
# Mcombat = ComBat(dat = as.matrix(mvals_fun), batch = batch, mod = modcombat)
# # PCA
# fact.pca.combat = prcomp(Mcombat, scale = TRUE)
# pairs(fact.pca.combat$rotation[,1:5], pch = 19, cex = 0.5, col = factor(pheno$Batch))
# quartz.save('pca_batch_combat.png', type = "png", dpi = 300)
# save(Mcombat, pheno, file = here("fact_funnorm_combat_data.RData"))
# knitr::include_graphics("pca_batch_combat.png")